Kapitel 6.5: Entitäten¶

Das Notebook ergänzt Kapitel 6.5 'Entitäten'.

Import¶

In [1]:
import pandas as pd
import numpy as np

from resources_statistics import *
from resources_geschichtslyrik import *

import plotly.express as px

from tqdm.notebook import tqdm

from scipy.stats import mannwhitneyu
In [2]:
# Load the corpus metadata table (path is relative to the notebook's working directory).
meta_path = "../resources/meta.json"
meta = pd.read_json(meta_path)

Merkmale hinzufügen¶

In [3]:
# Entities per text: number of '+' separators in `entity_simple` plus one
# (presumably one code per entity, joined by '+'); missing values count as 0.
meta['entity_count'] = meta['entity_simple'].map(
    lambda codes: 0 if pd.isna(codes) else codes.count('+') + 1
)

Korpora¶

Korpora erstellen¶

In [4]:
# Anthology corpus: rows flagged as 'anth' and as Geschichtslyrik, dated 1850-1918,
# reduced to one row per `author_title`.
is_anth = meta['corpus'] == 'anth'
in_period = meta['year'].between(1850, 1918)
is_geschichtslyrik = meta['geschichtslyrik'] == 1

meta_anth = meta[is_anth & in_period & is_geschichtslyrik].drop_duplicates(subset='author_title')
meta_anth_ratings = get_rating_table(meta_anth, mode='entity')
meta_anth_bin = binarize_meta(meta_anth)
In [5]:
# Supplementary corpus 'Kanonisierte Moderne': Geschichtslyrik texts (1850-1918)
# by the listed authors, one row per `author_title`.
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']

is_modcanon_author = meta['author'].isin(modcanon_authors)
in_period = meta['year'].between(1850, 1918)
is_geschichtslyrik = meta['geschichtslyrik'] == 1

meta_modcanon = (
    meta[is_modcanon_author & in_period & is_geschichtslyrik]
    .drop_duplicates(subset='author_title')
)
meta_modcanon_ratings = get_rating_table(meta_modcanon, mode='entity')
In [6]:
# Supplementary corpus 'Münchhausen-Kreis': Geschichtslyrik texts (1850-1918)
# by the listed authors, one row per `author_title`.
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

is_muench_author = meta['author'].isin(muench_authors)
in_period = meta['year'].between(1850, 1918)
is_geschichtslyrik = meta['geschichtslyrik'] == 1

meta_muench = (
    meta[is_muench_author & in_period & is_geschichtslyrik]
    .drop_duplicates(subset='author_title')
)
meta_muench_ratings = get_rating_table(meta_muench, mode='entity')
In [7]:
# One entry per subcorpus: display name, metadata frame, entity rating table.
subcorpora = [
    ('Anthologien', meta_anth, meta_anth_ratings),
    ('Kanonisierte Moderne', meta_modcanon, meta_modcanon_ratings),
    ('Münchhausen-Kreis', meta_muench, meta_muench_ratings),
]
sub_names = [entry[0] for entry in subcorpora]
sub_metas = [entry[1] for entry in subcorpora]
sub_ratings = [entry[2] for entry in subcorpora]

# Summary table, filled row by row in the next cell.
sub_df = pd.DataFrame()

Merkmale zu Subkorpora hinzufügen¶

In [8]:
# Codes as they occur in `entity_simple` and in the rating table's `type` / `rating`
# columns, mapped to the name fragments used for the `sub_df` columns.
entity_type_names = {
    '1': 'bekanntes_individuum',
    '2': 'unbekanntes_individuum',
    '3': 'kollektiv',
    '4': 'nichtmensch',
}
rating_names = {'0': 'neutral', '1': 'positiv', '2': 'negativ', '3': 'ambivalent'}


def _share(part, total):
    """Return part / total, or NaN when total is 0 (e.g. a subcorpus without entities of a type)."""
    return part / total if total else float('nan')


for this_name, this_meta, this_ratings in zip(sub_names, sub_metas, sub_ratings):
    text_total = this_meta.shape[0]
    entity_total = this_meta['entity_count'].sum()
    # All entity codes of the subcorpus in one string; type frequencies are
    # obtained by counting the code digits in it.
    entity_codes = ' '.join(this_meta['entity_simple'])

    sub_df.loc[this_name, 'Jahr'] = round(this_meta['year'].mean(), 0)
    sub_df.loc[this_name, 'Texte'] = text_total

    # Share of texts with exactly 1, 2, 3 and with 4 or more entities.
    entity_count_buckets = [
        ('1', this_meta['entity_count'] == 1),
        ('2', this_meta['entity_count'] == 2),
        ('3', this_meta['entity_count'] == 3),
        ('4', this_meta['entity_count'] >= 4),
    ]
    for bucket, in_bucket in entity_count_buckets:
        sub_df.loc[this_name, f'{bucket}_entity_share'] = _share(int(in_bucket.sum()), text_total)
    sub_df.loc[this_name, 'entity_count_mean'] = this_meta['entity_count'].mean()

    # Absolute and relative frequency of each entity type.
    for type_code, type_name in entity_type_names.items():
        sub_df.loc[this_name, f'{type_name}_count'] = entity_codes.count(type_code)
    for type_code, type_name in entity_type_names.items():
        sub_df.loc[this_name, f'{type_name}_share'] = _share(entity_codes.count(type_code), entity_total)

    # Rating distribution over all entities.
    for rating_code, rating_name in rating_names.items():
        rated = int((this_ratings['rating'] == rating_code).sum())
        sub_df.loc[this_name, f'entity_{rating_name}_share'] = _share(rated, entity_total)

    # Rating distribution within each entity type (types '1' to '3' only).
    for type_code in ['1', '2', '3']:
        type_name = entity_type_names[type_code]
        type_ratings = this_ratings[this_ratings['type'] == type_code]
        for rating_code, rating_name in rating_names.items():
            rated = int((type_ratings['rating'] == rating_code).sum())
            sub_df.loc[this_name, f'{type_name}_{rating_name}_share'] = _share(rated, type_ratings.shape[0])
In [9]:
# Show the subcorpus summary rounded to two decimals.
sub_df.round(2)
Out[9]:
Jahr Texte 1_entity_share 2_entity_share 3_entity_share 4_entity_share entity_count_mean bekanntes_individuum_count unbekanntes_individuum_count kollektiv_count ... bekanntes_individuum_negativ_share bekanntes_individuum_ambivalent_share unbekanntes_individuum_neutral_share unbekanntes_individuum_positiv_share unbekanntes_individuum_negativ_share unbekanntes_individuum_ambivalent_share kollektiv_neutral_share kollektiv_positiv_share kollektiv_negativ_share kollektiv_ambivalent_share
Anthologien 1875.0 1850.0 0.25 0.45 0.25 0.06 2.12 2025.0 631.0 1064.0 ... 0.14 0.07 0.48 0.34 0.13 0.05 0.31 0.37 0.25 0.06
Kanonisierte Moderne 1903.0 113.0 0.44 0.34 0.20 0.02 1.80 47.0 54.0 82.0 ... 0.22 0.02 0.67 0.19 0.12 0.02 0.40 0.26 0.27 0.07
Münchhausen-Kreis 1905.0 140.0 0.26 0.40 0.23 0.11 2.22 119.0 117.0 62.0 ... 0.13 0.08 0.62 0.30 0.06 0.03 0.44 0.31 0.23 0.03

3 rows × 31 columns

Zeitverlauf¶

In [10]:
# Time-series frame with one row per year from 1850 to 1918 (index named 'year').
year_index = pd.Series(range(1850, 1919), name='year')
ts = pd.DataFrame(index=year_index)
In [11]:
# Texts per year; years without any text get 0 before the counts are passed to `smooth`.
texts_per_year = meta_anth.groupby('year').size()
ts['text_count'] = texts_per_year.reindex(ts.index).fillna(0)
ts['text_sum'] = smooth(ts['text_count'], mode='sum')
In [12]:
# Texts per year by number of entities (1, 2, 3, 4 or more):
# raw count -> `smooth(..., mode='sum')` -> share relative to `text_sum`.
entity_count_buckets = {'1': '== 1', '2': '== 2', '3': '== 3', '4': '>= 4'}
for bucket, comparison in entity_count_buckets.items():
    bucket_query = f"year == @x and entity_count {comparison}"
    ts[f'{bucket}_entity_count'] = [meta_anth.query(bucket_query).shape[0] for x in ts.index]
    ts[f'{bucket}_entity_sum'] = smooth(ts[f'{bucket}_entity_count'], mode='sum')
    ts[f'{bucket}_entity_share_smoothed'] = ts[f'{bucket}_entity_sum'] / ts['text_sum']

# Entities per year in total, and relative to `text_sum`.
ts['total_entity_count'] = [meta_anth.query("year == @x")['entity_count'].sum() for x in ts.index]
ts['total_entity_count_sum'] = smooth(ts['total_entity_count'], mode='sum')
ts['total_entity_count_smoothed'] = ts['total_entity_count_sum'] / ts['text_sum']

# Entities per year by type: the type code digits are counted in the joined
# `entity_simple` strings of each year; shares are relative to all entities.
entity_codes_per_year = [' '.join(meta_anth.query("year == @x")['entity_simple']) for x in ts.index]
entity_type_names = {
    '1': 'bekanntes_individuum',
    '2': 'unbekanntes_individuum',
    '3': 'kollektiv',
    '4': 'nichtmensch',
}
for type_code, type_name in entity_type_names.items():
    ts[f'{type_name}_count'] = [codes.count(type_code) for codes in entity_codes_per_year]
    ts[f'{type_name}_sum'] = smooth(ts[f'{type_name}_count'], mode='sum')
    ts[f'{type_name}_share_smoothed'] = ts[f'{type_name}_sum'] / ts['total_entity_count_sum']

# Yearly rating counts, overall ('entity') and per entity type.
# `None` means "no type filter"; the other values are `type` codes of the rating table.
rated_groups = {
    'entity': None,
    'bekanntes_individuum': '1',
    'unbekanntes_individuum': '2',
    'kollektiv': '3',
}
rating_names = {'0': 'neutral', '1': 'positiv', '2': 'negativ', '3': 'ambivalent'}

# Start every count at 0 so that years without rating rows contribute 0 instead of NaN,
# consistent with the other yearly counts in `ts`.
for group_name in rated_groups:
    for rating_name in rating_names.values():
        ts[f'{group_name}_{rating_name}_count'] = 0.0

for year in ts.index:
    this_ratings = get_rating_table(meta = meta_anth.query("year == @year"), mode = 'entity')
    # Empty rating tables are skipped; the counts for that year stay at 0.
    if this_ratings.shape[0] == 0:
        continue
    for group_name, type_code in rated_groups.items():
        if type_code is None:
            group_ratings = this_ratings
        else:
            group_ratings = this_ratings[this_ratings['type'] == type_code]
        for rating_code, rating_name in rating_names.items():
            ts.at[year, f'{group_name}_{rating_name}_count'] = (group_ratings['rating'] == rating_code).sum()
        
# Smooth the yearly rating counts and relate them to the matching smoothed total:
# all entities for the 'entity' group, otherwise the entities of the respective type.
share_denominators = {
    'entity': 'total_entity_count_sum',
    'bekanntes_individuum': 'bekanntes_individuum_sum',
    'unbekanntes_individuum': 'unbekanntes_individuum_sum',
    'kollektiv': 'kollektiv_sum',
}
for group_name, denominator in share_denominators.items():
    for rating_name in ['neutral', 'positiv', 'negativ', 'ambivalent']:
        column_stem = f'{group_name}_{rating_name}'
        ts[f'{column_stem}_sum'] = smooth(ts[f'{column_stem}_count'], mode='sum')
        ts[f'{column_stem}_share_smoothed'] = ts[f'{column_stem}_sum'] / ts[denominator]

Anzahl¶

In [13]:
# Relative frequency of texts by number of entities in the anthology corpus.
meta_plot = meta_anth['entity_count'].value_counts(normalize=True)

fig = px.bar(
    meta_plot,
    labels = {'value' : '<br>Anteil Texte', 'entity_count' : 'Anzahl Entitäten'},
)

# Axis title fonts are set through `title.font`: the legacy `titlefont` shortcut is
# deprecated in Plotly and rejected by newer releases.
axis_style = dict(tickfont=dict(size=16), title=dict(font=dict(size=16)))
fig.update_layout(
    width=900, height=500,
    xaxis=axis_style,
    yaxis=axis_style,
    legend=dict(font = dict(size=16), traceorder = 'normal'),
    showlegend=False
)
fig = update_fig_for_publication(fig, make_grey=True)
# Written relative to the working directory; the `plots` folder has to exist.
fig.write_image("plots/6.5 Anzahl Entitäten.pdf")
fig.show()
In [14]:
# Column whose correlations with the other features are examined in the following cells.
main_feature = 'entity_count'
In [15]:
# The 20 features with the highest correlation to `main_feature`.
main_feature_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]
main_feature_corrs.sort_values(ascending=False).head(20)
Out[15]:
entity_count                     1.000000
entity_positiv                   0.398069
bekanntes_individuum_count       0.390660
kollektiv_count                  0.390625
entity_negativ                   0.382087
ballade                          0.341118
unbekanntes_individuum_count     0.334462
words                            0.324643
entity_neutral                   0.306401
sprechakt_erzaehlen_vorhanden    0.303150
ereignis                         0.301161
kollektiv_negativ                0.247066
in_hohem_mass_konkret            0.226377
konkretheit                      0.220873
wissen_ergaenzend                0.204284
bekanntes_individuum_negativ     0.165396
objektmarker_vorhanden           0.160960
marker_count                     0.134463
entity_ambivalent                0.133834
kleinraum_count                  0.114754
Name: entity_count, dtype: float64
In [16]:
# The 20 features with the lowest (most negative) correlation to `main_feature`.
main_feature_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]
main_feature_corrs.sort_values(ascending=True).head(20)
Out[16]:
zustand                           -0.332987
sprechakt_beschreiben_vorhanden   -0.229935
wissen_identisch                  -0.229154
nogenre                           -0.199152
sprechakt_behaupten_vorhanden     -0.190417
sprechinstanz_markiert            -0.163892
sprechakte_count                  -0.138117
geschichtsauffassung_positiv      -0.132598
neuzeit                           -0.131967
ueberlieferung_positiv            -0.128348
religion_positiv                  -0.126169
liebe_negativ                     -0.124568
year                              -0.122245
decade                            -0.119999
ende                              -0.109857
gegenwartsbezug                   -0.108620
politik_positiv                   -0.107572
sprechinstanz_in_vergangenheit    -0.107110
rollengedicht                     -0.102574
zeit_mitte                        -0.101660
Name: entity_count, dtype: float64
In [17]:
# Correlation threshold passed to `get_features` and reused when filtering results below.
threshold = 0.3

# Correlations of all numeric columns with `main_feature`, computed once and
# handed to `get_features` in both modes ('bin' and 'cont').
main_feature_corrs = meta_anth_bin.corr(numeric_only=True)[main_feature]
bin_comp_features = get_features(main_feature_corrs, threshold=threshold, mode='bin')
cont_comp_features = get_features(main_feature_corrs, threshold=threshold, mode='cont')
In [18]:
# Run the project helper `relations_contbin` for `main_feature` against the
# comparison features selected with mode 'bin'.
contbin_args = dict(
    meta=meta_anth_bin,
    main_feature=main_feature,
    comp_features=bin_comp_features,
)
results = relations_contbin(**contbin_args)
In [19]:
# Entity rating features are excluded from the filtered view.
directly_related = [
    'entity_positiv', 'entity_negativ', 'entity_ambivalent', 'entity_neutral'
]

# Keep rows with Mann-Whitney p < 0.05 whose point-biserial correlation reaches
# the threshold in either direction, ordered from highest to lowest correlation.
not_directly_related = ~results.index.isin(directly_related)
is_significant = results['mannwhitneyu_p'] < 0.05
is_strong = (results['pointbiserialr_corr'] >= threshold) | (results['pointbiserialr_corr'] <= -threshold)

results_filtered = (
    results[not_directly_related & is_significant & is_strong]
    .sort_values(by='pointbiserialr_corr', ascending=False)
)
results_filtered.round(2)
Out[19]:
wenn entity_count = 0: Anteil Texte mit Feature = ... wenn entity_count = 1: Anteil Texte mit Feature = ... wenn entity_count = 2: Anteil Texte mit Feature = ... wenn entity_count = 3: Anteil Texte mit Feature = ... wenn entity_count > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
ballade 0 [0/0] 0.27233115468409586 [125/459] 0.5846338535414166 [487/833] 0.748898678414097 [340/454] 0.8076923076923077 [84/104] 0.34 0.0 0.0 -0.73 257511.0 0.0
sprechakt_erzaehlen_vorhanden 0 [0/0] 0.5424836601307189 [249/459] 0.7959183673469388 [663/833] 0.9030837004405287 [410/454] 0.9423076923076923 [98/104] 0.30 0.0 0.0 -0.79 182174.0 0.0
ereignis 0 [0/0] 0.5490196078431373 [252/459] 0.8055222088835534 [671/833] 0.9096916299559471 [413/454] 0.9326923076923077 [97/104] 0.30 0.0 0.0 -0.79 177223.0 0.0
zustand 0 [0/0] 0.7037037037037037 [323/459] 0.3817527010804322 [318/833] 0.23348017621145375 [106/454] 0.20192307692307693 [21/104] -0.33 0.0 0.0 0.72 574577.5 0.0
In [20]:
# All remaining rows of `results`, i.e. those not contained in `results_filtered`.
in_filtered = results.index.isin(results_filtered.index)
results_other = results[~in_filtered]
results_other.sort_values(by='pointbiserialr_corr', ascending=False).round(2)
Out[20]:
wenn entity_count = 0: Anteil Texte mit Feature = ... wenn entity_count = 1: Anteil Texte mit Feature = ... wenn entity_count = 2: Anteil Texte mit Feature = ... wenn entity_count = 3: Anteil Texte mit Feature = ... wenn entity_count > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
entity_positiv 0 [0/0] 0.5904139433551199 [271/459] 0.45738295318127253 [381/833] 0.33480176211453744 [152/454] 0.21153846153846154 [22/104] 0.40 0.00 0.87 -0.01 222969.0 0.75
entity_negativ 0 [0/0] 0.0915032679738562 [42/459] 0.28331332533013204 [236/833] 0.3920704845814978 [178/454] 0.25961538461538464 [27/104] 0.38 0.00 0.00 -0.60 211593.5 0.00
entity_neutral 0 [0/0] 0.2679738562091503 [123/459] 0.28331332533013204 [236/833] 0.29955947136563876 [136/454] 0.22115384615384615 [23/104] 0.31 0.00 0.00 -0.19 243110.5 0.00
kollektiv_negativ 0 [0/0] 0.11764705882352941 [12/102] 0.2587601078167116 [96/371] 0.397887323943662 [113/284] 0.5416666666666666 [39/72] 0.25 0.00 0.00 -0.54 53095.5 0.00
bekanntes_individuum_negativ 0 [0/0] 0.08118081180811808 [22/271] 0.18252730109204368 [117/641] 0.23466666666666666 [88/375] 0.3375 [27/80] 0.17 0.00 0.00 -0.43 110161.0 0.00
entity_ambivalent 0 [0/0] 0.05010893246187364 [23/459] 0.10684273709483794 [89/833] 0.14977973568281938 [68/454] 0.11538461538461539 [12/104] 0.13 0.00 0.00 -0.35 125649.5 0.00
unbekanntes_individuum_positiv 0 [0/0] 0.2702702702702703 [10/37] 0.4090909090909091 [81/198] 0.4473684210526316 [68/152] 0.48936170212765956 [23/47] 0.10 0.03 0.03 -0.21 20620.5 0.05
stoffgebiet_negativ 0 [0/0] 0.19389978213507625 [89/459] 0.22448979591836735 [187/833] 0.2356828193832599 [107/454] 0.34615384615384615 [36/104] 0.09 0.00 0.00 -0.17 263262.0 0.01
unbekanntes_individuum_negativ 0 [0/0] 0.10810810810810811 [4/37] 0.16161616161616163 [32/198] 0.18421052631578946 [28/152] 0.23404255319148937 [11/47] 0.08 0.11 0.11 -0.20 12090.5 0.13
bekanntes_individuum_positiv 0 [0/0] 0.6752767527675276 [183/271] 0.672386895475819 [431/641] 0.7173333333333334 [269/375] 0.75 [60/80] 0.05 0.08 0.08 -0.11 189307.5 0.09
stoffgebiet_ambivalent 0 [0/0] 0.11982570806100218 [55/459] 0.17406962785114047 [145/833] 0.18722466960352424 [85/454] 0.125 [13/104] 0.05 0.05 0.08 -0.11 211681.5 0.03
kollektiv_positiv 0 [0/0] 0.5098039215686274 [52/102] 0.40161725067385445 [149/371] 0.4295774647887324 [122/284] 0.5277777777777778 [38/72] 0.02 0.63 0.63 -0.03 83444.5 0.75
stoffgebiet_neutral 0 [0/0] 0.17429193899782136 [80/459] 0.16566626650660263 [138/833] 0.19162995594713655 [87/454] 0.18269230769230768 [19/104] -0.02 0.44 0.70 -0.02 230313.5 0.57
stoffgebiet_positiv 0 [0/0] 0.4422657952069717 [203/459] 0.44537815126050423 [371/833] 0.43612334801762115 [198/454] 0.41346153846153844 [43/104] -0.05 0.02 0.23 0.06 357358.0 0.29
In [21]:
# Compare the filtered relations between the two halves of the period (1850-1884 vs. 1885-1918).
result_categories = ['pointbiserialr_corr', 'mannwhitneyu_p']

meta_1850_1884 = meta_anth_bin[meta_anth_bin['year'].between(1850, 1884)]
meta_1885_1918 = meta_anth_bin[meta_anth_bin['year'].between(1885, 1918)]

results_a = relations_contbin(
    meta=meta_1850_1884, main_feature=main_feature, comp_features=results_filtered.index
)
results_b = relations_contbin(
    meta=meta_1885_1918, main_feature=main_feature, comp_features=results_filtered.index
)

# Side-by-side table; the suffixes name the first year of each half.
results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
results_merged['diff_of_corrs'] = (
    results_merged['pointbiserialr_corr_1885'] - results_merged['pointbiserialr_corr_1850']
)

results_merged.sort_values(by='diff_of_corrs').round(3)
Out[21]:
pointbiserialr_corr_1850 mannwhitneyu_p_1850 pointbiserialr_corr_1885 mannwhitneyu_p_1885 diff_of_corrs
ereignis 0.304 0.0 0.266 0.0 -0.038
sprechakt_erzaehlen_vorhanden 0.308 0.0 0.271 0.0 -0.037
zustand -0.320 0.0 -0.337 0.0 -0.017
ballade 0.327 0.0 0.337 0.0 0.010
In [22]:
# Run the project helper `relations_contbin_ratings` for `main_feature`
# and show the result ordered by ascending correlation.
results = relations_contbin_ratings(meta_anth_bin, main_feature)
results.sort_values(by='pointbiserialr_corr').round(2)
  0%|          | 0/14 [00:00<?, ?it/s]
Out[22]:
wenn entity_count = 1: Anteil mit Feature = ... wenn entity_count = 2: Anteil mit Feature = ... wenn entity_count = 3: Anteil mit Feature = ... wenn entity_count = 4: Anteil mit Feature = ... pointbiserialr_corr pointbiserialr_p mannwhitneyu_stat mannwhitneyu_p
bekanntes_individuum_positiv 0.6753 [183/271] 0.5681 [517/910] 0.5368 [357/665] 0.525 [84/160] -0.07 0.00 460867.0 0.00
entity_positiv 0.5904 [271/459] 0.4832 [805/1666] 0.4427 [603/1362] 0.4415 [166/376] -0.07 0.00 1755160.0 0.00
stoffgebiet_positiv 0.5122 [315/615] 0.4656 [535/1149] 0.4416 [276/625] 0.3897 [53/136] -0.06 0.00 748314.5 0.00
kollektiv_positiv 0.5098 [52/102] 0.3661 [160/437] 0.3342 [133/398] 0.4054 [45/111] -0.05 0.11 124671.5 0.09
stoffgebiet_neutral 0.2049 [126/615] 0.188 [216/1149] 0.1808 [113/625] 0.1985 [27/136] -0.03 0.21 480498.5 0.25
entity_ambivalent 0.0501 [23/459] 0.0642 [107/1666] 0.0705 [96/1362] 0.0452 [17/376] -0.00 0.76 449486.0 0.93
unbekanntes_individuum_negativ 0.1081 [4/37] 0.1311 [32/244] 0.1319 [31/235] 0.1313 [13/99] -0.00 0.91 22214.0 0.97
entity_neutral 0.268 [123/459] 0.3001 [500/1666] 0.3018 [411/1362] 0.2926 [110/376] 0.00 0.76 1612919.5 0.58
stoffgebiet_ambivalent 0.1024 [63/615] 0.1401 [161/1149] 0.1552 [97/625] 0.1176 [16/136] 0.03 0.08 397031.0 0.03
unbekanntes_individuum_positiv 0.2703 [10/37] 0.3484 [85/244] 0.3447 [81/235] 0.3232 [32/99] 0.04 0.28 46145.5 0.55
stoffgebiet_negativ 0.1805 [111/615] 0.2063 [237/1149] 0.2224 [139/625] 0.2941 [40/136] 0.07 0.00 582096.0 0.00
bekanntes_individuum_negativ 0.0812 [22/271] 0.1319 [120/910] 0.1474 [98/665] 0.2 [32/160] 0.08 0.00 270031.5 0.00
entity_negativ 0.0915 [42/459] 0.1525 [254/1666] 0.185 [252/1362] 0.2207 [83/376] 0.09 0.00 1193181.5 0.00
kollektiv_negativ 0.1176 [12/102] 0.2197 [96/437] 0.2915 [116/398] 0.3423 [38/111] 0.14 0.00 125138.0 0.00
In [23]:
# Run the project helper `relations_contcont` for `main_feature` against the
# comparison features selected with mode 'cont'.
contcont_args = dict(
    meta=meta_anth_bin,
    main_feature=main_feature,
    comp_features=cont_comp_features,
)
results = relations_contcont(**contcont_args)
In [24]:
# Display the unrounded `relations_contcont` result table.
results
Out[24]:
wenn entity_count = 0: Mittelwert Feature = ... wenn entity_count = 1: Mittelwert Feature = ... wenn entity_count = 2: Mittelwert Feature = ... wenn entity_count = 3: Mittelwert Feature = ... wenn entity_count > 3: Mittelwert Feature = ... pearsonr_corr pearsonr_p
bekanntes_individuum_count NaN 0.590414 1.092437 1.464758 1.721154 0.390660 1.666472e-68
words NaN 242.457275 292.849614 369.071942 550.726316 0.324643 1.412541e-43
unbekanntes_individuum_count NaN 0.080610 0.292917 0.517621 1.105769 0.334462 1.370560e-49
kollektiv_count NaN 0.222222 0.524610 0.876652 1.221154 0.390625 1.718298e-68
In [25]:
# Text length (in words) per number of entities; every text is drawn as a point
# with author and title in the hover box.
box_labels = {'words' : 'Textlänge in Wörtern', 'entity_count' : 'Anzahl Entitäten'}
px.box(
    meta_anth_bin,
    x='entity_count',
    y='words',
    points='all',
    hover_data=['author', 'title'],
    labels=box_labels,
)
In [26]:
# Compare the continuous relations between the two halves of the period (1850-1884 vs. 1885-1918).
result_categories = ['pearsonr_corr', 'pearsonr_p']

meta_1850_1884 = meta_anth_bin[meta_anth_bin['year'].between(1850, 1884)]
meta_1885_1918 = meta_anth_bin[meta_anth_bin['year'].between(1885, 1918)]

results_a = relations_contcont(
    meta=meta_1850_1884, main_feature=main_feature, comp_features=cont_comp_features
)
results_b = relations_contcont(
    meta=meta_1885_1918, main_feature=main_feature, comp_features=cont_comp_features
)

# Side-by-side table; the suffixes name the first year of each half.
results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
results_merged['diff_of_corrs'] = (
    results_merged['pearsonr_corr_1885'] - results_merged['pearsonr_corr_1850']
)

results_merged.sort_values(by='diff_of_corrs').round(3)
Out[26]:
pearsonr_corr_1850 pearsonr_p_1850 pearsonr_corr_1885 pearsonr_p_1885 diff_of_corrs
bekanntes_individuum_count 0.416 0.0 0.278 0.0 -0.138
kollektiv_count 0.407 0.0 0.319 0.0 -0.088
words 0.316 0.0 0.316 0.0 -0.000
unbekanntes_individuum_count 0.300 0.0 0.473 0.0 0.173
In [27]:
# Smoothed shares of texts with 1, 2, 3 and 4+ entities, renamed to their legend labels.
share_labels = {
    '1_entity_share_smoothed': 'Text mit einer Entität',
    '2_entity_share_smoothed': 'Text mit zwei Entitäten',
    '3_entity_share_smoothed': 'Text mit drei Entitäten',
    '4_entity_share_smoothed': 'Text mit vier<br>oder mehr Entitäten',
}
meta_plot = ts[list(share_labels)].rename(columns=share_labels)
# Data export is switched off for this plot: save_ts_data(meta_plot, prefix='Anzahl_Entitaeten_')

# The subcorpus values from `sub_df` are passed along via the `add_corpora*` arguments.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['1_entity_share', '2_entity_share', '3_entity_share', '4_entity_share'],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Anzahl Entitäten im Zeitverlauf (differenziert).pdf")
fig.show()
In [28]:
# Smoothed number of entities per text over time, renamed to its legend label.
meta_plot = ts[['total_entity_count_smoothed']].rename(
    columns={'total_entity_count_smoothed': 'Entitäten pro Text'}
)
# Data export is switched off for this plot: save_ts_data(meta_plot)

fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Mittelwert Entitäten pro Text',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['entity_count_mean'],
)
# Fixed y-axis window for this figure.
fig.update_layout(yaxis_range=[0.9, 2.5])
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image("plots/6.5 Anzahl Entitäten im Zeitverlauf (gesamt).pdf")
fig.show()
In [29]:
# Ballads only: compare entity counts of the early and the late half of the period.
meta_test = meta_anth_bin[meta_anth_bin['ballade'] == 1]

early = meta_test.loc[meta_test['year'].between(1850, 1884), 'entity_count']
late = meta_test.loc[meta_test['year'].between(1885, 1918), 'entity_count']
mannwhitneyu_results = mannwhitneyu(early, late)

# Index 1 of the test result is the p-value.
p_value = mannwhitneyu_results[1]
print(f"1850–1884 : {len(early)} Texte. Mean entity_count = {early.mean()}")
print(f"1885–1918 : {len(late)} Texte. Mean entity_count = {late.mean()}")
print(f"mannwhitney_u p = {p_value}")
1850–1884 : 812 Texte. Mean entity_count = 2.4051724137931036
1885–1918 : 224 Texte. Mean entity_count = 2.2767857142857144
mannwhitney_u p = 0.02147667981898758
In [30]:
# Texts of comparable length only: repeat the early/late comparison of entity
# counts within each word-count bin (bounds are inclusive).
length_bins = [(0, 100), (101, 200), (201, 300), (301, 400), (401, 99999)]
for min_length, max_length in length_bins:
    meta_length = meta_anth_bin[meta_anth_bin['words'].between(min_length, max_length)]

    early = meta_length.loc[meta_length['year'].between(1850, 1884), 'entity_count']
    late = meta_length.loc[meta_length['year'].between(1885, 1918), 'entity_count']
    mannwhitneyu_results = mannwhitneyu(early, late)
    p_value = mannwhitneyu_results[1]

    print(f"{min_length}–{max_length} words")
    print(f"1850–1884 : {len(early)} Texte. Mean entity_count = {early.mean()}")
    print(f"1885–1918 : {len(late)} Texte. Mean entity_count = {late.mean()}")
    print(f"mannwhitney_u p = {p_value}")
    print("\n")
0–100 words
1850–1884 : 86 Texte. Mean entity_count = 1.744186046511628
1885–1918 : 67 Texte. Mean entity_count = 1.5970149253731343
mannwhitney_u p = 0.20132660417501014


101–200 words
1850–1884 : 282 Texte. Mean entity_count = 1.950354609929078
1885–1918 : 145 Texte. Mean entity_count = 1.7655172413793103
mannwhitney_u p = 0.01845252254663496


201–300 words
1850–1884 : 298 Texte. Mean entity_count = 2.0738255033557045
1885–1918 : 123 Texte. Mean entity_count = 1.9593495934959348
mannwhitney_u p = 0.1798167179883755


301–400 words
1850–1884 : 225 Texte. Mean entity_count = 2.2533333333333334
1885–1918 : 75 Texte. Mean entity_count = 2.066666666666667
mannwhitney_u p = 0.09932173278726612


401–99999 words
1850–1884 : 330 Texte. Mean entity_count = 2.503030303030303
1885–1918 : 92 Texte. Mean entity_count = 2.402173913043478
mannwhitney_u p = 0.19640701343661437


Typen¶

In [31]:
# Smoothed shares of the four entity types over time, renamed to their legend labels.
type_labels = {
    'bekanntes_individuum_share_smoothed': 'Bekanntes Individuum',
    'unbekanntes_individuum_share_smoothed': 'Unbekanntes Individuum',
    'kollektiv_share_smoothed': 'Kollektiv',
    'nichtmensch_share_smoothed': 'Nicht-menschliche Entität',
}
meta_plot = ts[list(type_labels)].rename(columns=type_labels)
save_ts_data(meta_plot, prefix='06_05_Entitaeten_')

# The subcorpus values from `sub_df` are passed along via the `add_corpora*` arguments.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Entitäten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=[
        'bekanntes_individuum_share',
        'unbekanntes_individuum_share',
        'kollektiv_share',
        'nichtmensch_share',
    ],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Entitätentypen im Zeitverlauf.pdf")
fig.show()

Anstieg bekannte Individuen ab 1905¶

In [32]:
# Per-author statistics for anthology texts from 1905 onwards.
meta_time = meta_anth.query("year >= 1905").copy()
top_authors_df = pd.DataFrame()

# Visit each author once (in order of first appearance) rather than once per text,
# which recomputed identical rows for every additional text of the same author.
for author in meta_time['author'].unique():
    meta_author = meta_time.query("author == @author")
    author_entity_total = meta_author['entity_count'].sum()
    top_authors_df.loc[author, 'text_count'] = meta_author.shape[0]
    top_authors_df.loc[author, 'entity_count'] = author_entity_total
    # Share of type-'1' entities among the author's entities.
    # NOTE: the column name is spelled 'bekannes_...' (sic); it is kept unchanged
    # so the displayed table keeps its existing header.
    top_authors_df.loc[author, 'bekannes_individuum_share'] = (
        ''.join(meta_author['entity_simple']).count("1") / author_entity_total
    )

# Authors with at least 10 entities, ordered by descending share.
top_authors_df.query("entity_count >= 10").sort_values(by = 'bekannes_individuum_share', ascending=False)
Out[32]:
text_count entity_count bekannes_individuum_share
Blunck, Hans Friedrich 6.0 14.0 0.714286
Lissauer, Ernst 15.0 22.0 0.545455
Gaudy, Alice von 5.0 15.0 0.533333
Schrutz, Demetrius 10.0 21.0 0.476190
Schüler, Gustav 7.0 14.0 0.357143
Geißler, Max 11.0 18.0 0.333333
Hohlbaum, Robert 6.0 12.0 0.333333
Weber, Ernst 7.0 19.0 0.263158
Münchhausen, Börries von 9.0 15.0 0.200000
Löns, Hermann 7.0 16.0 0.062500

Kollektiv-Entitäten in Ergänzungskorpus 'Kanonisierte Moderne'¶

In [33]:
# Per-author totals in the 'Kanonisierte Moderne' corpus: number of texts,
# number of entities, and number of collective entities.
meta_modcanon_bin = binarize_meta(meta_modcanon)
by_author = meta_modcanon_bin.groupby('author')
results = pd.DataFrame({
    'texts' : by_author.size(),
    'entities' : by_author['entity_count'].sum(),
    'kollektiv_abs' : by_author['kollektiv_count'].sum(),
})
# Relative share of collectives among all entities of the author.
results['kollektiv_rel'] = results['kollektiv_abs'] / results['entities']
results
Out[33]:
texts entities kollektiv_abs kollektiv_rel
author
George, Stefan 16 27 12 0.444444
Heym, Georg 43 78 44 0.564103
Hofmannsthal, Hugo von 8 11 3 0.272727
Rilke, Rainer Maria 46 87 23 0.264368
In [34]:
# Five most frequent entity labels of type '3' (collectives) in texts by Georg Heym.
meta_modcanon_ratings.query("author == 'Heym, Georg' and type == '3'")['full'].value_counts().head()
Out[34]:
full
Perser/Soldaten          15
Griechen/Soldaten         7
Franzosen/Soldaten        3
Tote                      2
Pariser Revolutionäre     1
Name: count, dtype: int64

Häufigste Entitäten¶

In [35]:
# Frequency table of type-'1' entities (known individuals) in the anthology
# corpus; the rename covers pandas versions that name the counts column 'full'.
top_bekannte_individuen = (
    meta_anth_ratings
    .query("type=='1'")['full']
    .value_counts()
    .pipe(pd.DataFrame)
    .rename(columns={'full':'count'})
)
top_bekannte_individuen.query("count >= 10")
Out[35]:
count
full
Friedrich der Große 67
Martin Luther 50
Wilhelm I. (Deutsches Reich) 48
Karl der Große 47
Friedrich I. Barbarossa 31
Napoleon Bonaparte 25
Karl V. (HRR) 22
Friedrich Wilhelm (Brandenburg) 20
Ulrich von Hutten 19
Otto I. (HRR) 18
Konrad IV. (Schwaben) 16
Otto von Bismarck 16
Luise von Mecklenburg-Strelitz 15
Maximilian I. (HRR) 15
Heinrich IV. (HRR) 14
Heinrich I. (Ostfrankenreich) 13
Friedrich II. (HRR) 13
Ludwig IV. (HRR) 12
Maria Theresia 11
Widukind (Sachsen) 11
Napoleon III. 10
Helmuth von Moltke 10
Joseph II. 10
Heinrich der Löwe 10
Gaius Iulius Caesar 10
In [36]:
# Labels of type-'2' entities (unknown individuals) can be slash-separated
# composites. Join all labels with a sentinel, turn every slash into the same
# sentinel, and split again so each component is counted on its own.
unbekannte_labels = meta_anth_ratings.query("type=='2'")['full']
unbekannte_tokens = re.sub("/", " XXX ", ' XXX '.join(unbekannte_labels)).split(" XXX ")
top_unbekannte_individuen = (
    pd.DataFrame(pd.Series(unbekannte_tokens).value_counts())
    .rename(columns={0:'count'})
)
top_unbekannte_individuen.query("count >= 10")
Out[36]:
count
Soldat 112
Frau 55
Geistlicher 42
Mann 32
Alte Person 28
Junge Person 27
Deutscher 26
Sprechinstanz 21
Ritter 21
Bauer 20
Mensch 17
Sohn 16
Kind 14
Mutter 14
Germane 11
Tote Person 11
Mönch 11
Bote 10
Schweizer 10
Römer 10
In [37]:
# Labels of type-'3' entities (collectives) can be slash-separated composites.
# Join all labels with a sentinel, turn every slash into the same sentinel,
# and split again so each component is counted on its own.
kollektiv_labels = meta_anth_ratings.query("type=='3'")['full']
kollektiv_tokens = re.sub("/", " XXX ", ' XXX '.join(kollektiv_labels)).split(" XXX ")
top_kollektive = (
    pd.DataFrame(pd.Series(kollektiv_tokens).value_counts())
    .rename(columns={0:'count'})
)
top_kollektive.query("count >= 10")
Out[37]:
count
Soldaten 289
Deutsche 150
Stadtbevölkerung 57
Römer 47
Österreicher 42
Menschen 34
Preußen 31
Schweizer 31
Franzosen 30
Bauern 25
Sachsen 24
Ritter 23
Geistliche 23
Griechen 21
Germanen 19
Goten 17
Feinde 16
Adlige 16
Türken 15
Heiden 13
Christen 13
Hunnen 12
Fürsten 12
Politiker 11
Ungarn 11
Franken 10
Gläubige 10
In [38]:
# Most frequent known individuals when texts by C. F. Meyer are left out
top_bekannte_individuen = (
    meta_anth_ratings
    .query("author != 'Meyer, Conrad Ferdinand' and type=='1'")['full']
    .value_counts()
    .pipe(pd.DataFrame)
    .rename(columns={'full':'count'})
)
top_bekannte_individuen.head(5)
Out[38]:
count
full
Friedrich der Große 67
Martin Luther 48
Wilhelm I. (Deutsches Reich) 48
Karl der Große 47
Friedrich I. Barbarossa 31
In [39]:
# Sample 100 bekannte Individuen
# top_bekannte_individuen.sample(n=100).to_csv("plots/6.5 bekannte_individuen_sample.csv")
In [40]:
# Known individuals that occur at least 4 times in the anthology corpus.
# (The previous comment said "at least 3", but the filter has always been
# `count >= 4`; the comment now matches the code.)
top_bekannte_individuen = meta_anth_ratings.query("type=='1'")['full'].value_counts()
top_bekannte_individuen = pd.DataFrame(top_bekannte_individuen)
top_bekannte_individuen = top_bekannte_individuen.rename(columns={'full':'count'})
top_bekannte_individuen_list = top_bekannte_individuen.query("count >= 4").index.tolist()

# Reuse the list computed above instead of running the same query again.
print(len(top_bekannte_individuen_list))
print(' | '.join(top_bekannte_individuen_list))
92
Friedrich der Große | Martin Luther | Wilhelm I. (Deutsches Reich) | Karl der Große | Friedrich I. Barbarossa | Napoleon Bonaparte | Karl V. (HRR) | Friedrich Wilhelm (Brandenburg) | Ulrich von Hutten | Otto I. (HRR) | Konrad IV. (Schwaben) | Otto von Bismarck | Luise von Mecklenburg-Strelitz | Maximilian I. (HRR) | Heinrich IV. (HRR) | Heinrich I. (Ostfrankenreich) | Friedrich II. (HRR) | Ludwig IV. (HRR) | Maria Theresia | Widukind (Sachsen) | Napoleon III. | Helmuth von Moltke | Joseph II. | Heinrich der Löwe | Gaius Iulius Caesar | Friedrich III. (Deutsches Reich) | Franz Joseph I. | Rudolf I. (HRR) | Eugen von Savoyen | Gebhard Leberecht von Blücher | Karl I. (Neapel) | Johann T’Serclaes von Tilly | Manfred (Sizilien) | Friedrich der Schöne | Alarich | Wallenstein | Attila | Friedrich I. (Brandenburg) | Gustav Adolf | Friedrich Wilhelm IV. | Johannes Gutenberg | Friedrich Wilhelm III. (Preußen) | Alexander der Große | Philipp I. (Hessen) | Alboin | Friedrich Wilhelm I. (Preußen) | Arminius | Nero | Peter der Große | Georg von Derfflinger | Wilhelm II. (Deutsches Reich) | Friedrich Schiller | Tassilo III. | Walther von der Vogelweide | Hannibal | Jan Hus | Friedrich Wilhelm von Seydlitz | Bonifatius | Radbod (Friesland) | Philipp Melanchthon | Joachim I. (Brandenburg) | Xerxes I. | Huldrych Zwingli | Albrecht I. (Brandenburg) | Rosamunde (Gepiden) | Karl von Österreich-Teschen | Friedrich III. (HRR) | Tejas | Christoph Kolumbus | Leopold I. (Habsburg) | Karl der Kühne | Albrecht I. (HRR) | Ludwig II. (Thüringen) | Hans Joachim von Zieten | Rudolf II. (HRR) | Heinrich VI. (HRR) | Albrecht Dürer | Wilhelm von Tegetthoff | Maximilian I. (Mexiko) | Luise Henriette von Oranien | Arnold Winkelried | August Neidhardt von Gneisenau | Henri de La Tour d’Auvergne, vicomte de Turenne | Ferdinand I. (HRR) | Christian von Braunschweig-Wolfenbüttel | Gregor VII. | Heinrich VII. (HRR) | Andreas Hofer | Tiberius | Henning Schindekopf | Friedrich II. 
(Brandenburg) | Friedrich III. (Sachsen)
In [41]:
# Share of the most frequent label among all counted components, once for
# unknown individuals and once for collectives.
soldat_share = (top_unbekannte_individuen.loc['Soldat']/top_unbekannte_individuen.sum()).tolist()[0]
print(f"Unbekanntes Individuum 'Soldat'   : {soldat_share}")
soldaten_share = (top_kollektive.loc['Soldaten']/top_kollektive.sum()).tolist()[0]
print(f"Kollektiv              'Soldaten' : {soldaten_share}")
Unbekanntes Individuum 'Soldat'   : 0.12134344528710726
Kollektiv              'Soldaten' : 0.18585209003215433

Bewertung¶

In [42]:
def create_rating_barplot(min_count, rating_df, label):
    """Show a bar chart of rating shares for sufficiently frequent entities.

    Parameters
    ----------
    min_count : number
        Minimum value of the 'count' column for a row to be plotted.
    rating_df : pandas.DataFrame
        One row per entity with the columns 'count', 'neutral', 'positiv',
        'negativ' and 'ambivalent'.
    label : str
        Name of the entity group, used in the figure title.

    The figure is displayed via ``fig.show()``; nothing is returned.
    """
    # Rows are ordered by their positive share.
    frequent = rating_df.query("count >= @min_count").sort_values(by='positiv')

    fig = px.bar(
        frequent,
        title = f"Anzahl {label} Count >= {min_count} : {frequent.shape[0]}",
        y = ['neutral', 'positiv', 'negativ', 'ambivalent'],
        hover_data = ['count'],
        labels={'value' : 'Anteil', 'index' : '', 'variable' : ''}
    )

    # Low thresholds get a wider canvas.
    figure_width = 1200 if min_count <= 5 else 1000
    fig.update_layout(height=600, width=figure_width)

    fig.show()

Ohne Zeitverlauf¶

In [43]:
# Rating distribution per entity type. Type '4' (non-human entities) is
# excluded from the per-type breakdown.
results = (
    meta_anth_ratings
    .query("type!='4'")
    .groupby('type')['rating']
    .value_counts(normalize=True)
    .to_frame()
    .reset_index()
    .sort_values(by=['type', 'rating'])
)
# Reference group: rating distribution over all rows of meta_anth_ratings
# (no type filter applied here).
add = (
    meta_anth_ratings['rating'].value_counts(normalize=True)
    .to_frame().reset_index()
    .sort_values(by='rating')
)
add['type'] = 'Alle Entitäten'
results = pd.concat([add, results])

# Replace the annotation codes by display labels.
results['type'] = results['type'].replace({
    '1' : 'Bekannte Individuen',
    '2' : 'Unbekannte Individuen',
    '3' : 'Kollektive',
    '4' : 'Nicht-menschliche Entitäten',
})
results['rating'] = results['rating'].replace({
    '0' : 'neutral',
    '1' : 'positiv',
    '2' : 'negativ',
    '3' : 'ambivalent',
})
fig = px.bar(
    results,
    x='type',
    y='proportion',
    color='rating',
    barmode='group',
    labels={'type':'','proportion':'Anteil', 'rating':''}
)
# `titlefont` is a deprecated axis attribute that recent Plotly releases
# reject; the supported spelling is `title.font`.
axis_style = dict(tickfont=dict(size=16), title=dict(font=dict(size=16)))
fig.update_layout(
    width=950, height=500,
    xaxis=axis_style,
    yaxis=axis_style,
    legend=dict(font = dict(size=16), traceorder = 'normal', x = 0.8, y = 0.97),
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Entitätenbewertung im Anthologiekorpus.pdf")
fig.show()
In [44]:
# Rating shares for every known individual that occurs at least 5 times.
rating_columns = {'0' : 'neutral', '1' : 'positiv', '2' : 'negativ', '3' : 'ambivalent'}

top_bekannte_individuen_list = top_bekannte_individuen.query("count >= 5").index.tolist()
top_bekannte_individuen_ratings = pd.DataFrame(index = top_bekannte_individuen_list)
for entity in top_bekannte_individuen_list:
    this_ratings = meta_anth_ratings.query("full == @entity")
    n_ratings = this_ratings.shape[0]
    top_bekannte_individuen_ratings.at[entity, 'count'] = n_ratings
    # One share column per rating code, in the order neutral/positiv/negativ/ambivalent.
    for rating_code, column in rating_columns.items():
        n_with_rating = this_ratings.query("rating == @rating_code").shape[0]
        top_bekannte_individuen_ratings.at[entity, column] = n_with_rating/n_ratings

for threshold in (10, 5):
    create_rating_barplot(min_count=threshold, rating_df = top_bekannte_individuen_ratings, label='Bekannte Individuen')
In [45]:
# Rating shares for every unknown-individual label that occurs at least 5 times.
# The type filter and the split of slash-separated labels do not depend on the
# entity, so they are done once here instead of once per entity and row.
unbekannte_ratings = meta_anth_ratings.query("type=='2'")
unbekannte_components = unbekannte_ratings['full'].str.split("/")

top_unbekannte_individuen_list = top_unbekannte_individuen.query("count >= 5").index.tolist()
top_unbekannte_individuen_ratings = pd.DataFrame(index = top_unbekannte_individuen_list)
for entity in top_unbekannte_individuen_list:
    # Rows whose label has the entity as one of its slash-separated components.
    has_entity = [entity in components for components in unbekannte_components]
    this_ratings = unbekannte_ratings[has_entity]
    n_ratings = this_ratings.shape[0]

    top_unbekannte_individuen_ratings.at[entity, 'count'] = n_ratings
    top_unbekannte_individuen_ratings.at[entity, 'neutral'] = this_ratings.query("rating=='0'").shape[0]/n_ratings
    top_unbekannte_individuen_ratings.at[entity, 'positiv'] = this_ratings.query("rating=='1'").shape[0]/n_ratings
    top_unbekannte_individuen_ratings.at[entity, 'negativ'] = this_ratings.query("rating=='2'").shape[0]/n_ratings
    top_unbekannte_individuen_ratings.at[entity, 'ambivalent'] = this_ratings.query("rating=='3'").shape[0]/n_ratings

create_rating_barplot(min_count=10, rating_df = top_unbekannte_individuen_ratings, label='Unbekannte Individuen')
create_rating_barplot(min_count=5, rating_df = top_unbekannte_individuen_ratings, label='Unbekannte Individuen')
In [46]:
# Rating shares for every collective label that occurs at least 5 times.
# The type filter and the split of slash-separated labels do not depend on the
# entity, so they are done once here instead of once per entity and row.
kollektiv_ratings = meta_anth_ratings.query("type=='3'")
kollektiv_components = kollektiv_ratings['full'].str.split("/")

top_kollektive_list = top_kollektive.query("count >= 5").index.tolist()
top_kollektive_ratings = pd.DataFrame(index = top_kollektive_list)
for entity in top_kollektive_list:
    # Rows whose label has the entity as one of its slash-separated components.
    has_entity = [entity in components for components in kollektiv_components]
    this_ratings = kollektiv_ratings[has_entity]
    n_ratings = this_ratings.shape[0]

    top_kollektive_ratings.at[entity, 'count'] = n_ratings
    top_kollektive_ratings.at[entity, 'neutral'] = this_ratings.query("rating=='0'").shape[0]/n_ratings
    top_kollektive_ratings.at[entity, 'positiv'] = this_ratings.query("rating=='1'").shape[0]/n_ratings
    top_kollektive_ratings.at[entity, 'negativ'] = this_ratings.query("rating=='2'").shape[0]/n_ratings
    top_kollektive_ratings.at[entity, 'ambivalent'] = this_ratings.query("rating=='3'").shape[0]/n_ratings

create_rating_barplot(min_count=10, rating_df = top_kollektive_ratings, label='Kollektive')
create_rating_barplot(min_count=5, rating_df = top_kollektive_ratings, label='Kollektive')
In [47]:
# Manual grouping of collective labels into German-speaking and
# non-German-speaking groups; labels in neither list get NaN and are left out
# of the plots.
deutsch = [
    'Deutsche', 'Österreicher', 'Schweizer', 'Sachsen', 'Preußen', 'Germanen', 'Schleswig-Holsteiner',
    'Dithmarscher', 'Bayern', 'Brandenburger'
] # not assigned: Goten, Tiroler, Franken, Quitzows, Wien
nichtdeutsch = [
    'Römer', 'Franzosen', 'Griechen', 'Türken', 'Hunnen', 'Ungarn', 'Schweden', 'Russen', 'Dänen', 'Welsche'
] # not assigned: Athen

sprache_values = []
for kollektiv in top_kollektive_ratings.index:
    if kollektiv in deutsch:
        sprache_values.append('deutschsprachig')
    elif kollektiv in nichtdeutsch:
        sprache_values.append('nicht deutschsprachig')
    else:
        sprache_values.append(float('NaN'))
top_kollektive_ratings['Sprache'] = sprache_values

# One box plot per rating share: negative first, then positive.
for share_column, axis_label in [
    ('negativ', 'Anteil negative Bewertung'),
    ('positiv', 'Anteil positive Bewertung'),
]:
    fig = px.box(
        top_kollektive_ratings.query("Sprache.notna()"),
        y = share_column,
        color = 'Sprache',
        points = 'all',
        hover_data = [top_kollektive_ratings.query("Sprache.notna()").index],
        labels = {share_column : axis_label}
    )
    fig.show()

Mit Zeitverlauf¶

In [48]:
# Smoothed rating shares over all entities, relabelled for the legend.
rating_labels = {
    'entity_neutral_share_smoothed' : 'neutral',
    'entity_positiv_share_smoothed' : 'positiv',
    'entity_negativ_share_smoothed' : 'negativ',
    'entity_ambivalent_share_smoothed' : 'ambivalent',
}
meta_plot = ts[list(rating_labels)].rename(columns = rating_labels)
# save_ts_data(meta_plot)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Entitäten-Bewertungen',
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = [
        'entity_neutral_share',
        'entity_positiv_share',
        'entity_negativ_share',
        'entity_ambivalent_share',
    ],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Entitätenbewertung im Zeitverlauf.pdf")
fig.show()
In [49]:
# Ballads only: per-text share of negatively rated entities and subject areas
# ("2" is counted as the negative rating code), averaged per decade.
meta_ballads = (
    meta_anth
    .query("gattung.str.contains('Ballade', na = False)")
    .copy()
    .assign(
        negative_entity_count = lambda df: [x.count("2") for x in df['entity_bewertung']],
        negative_entity_share = lambda df: df['negative_entity_count']/df['entity_count'],
        # Subject areas are '+'-separated, so separators + 1 gives their number.
        stoffgebiet_count = lambda df: [x.count("+")+1 for x in df['stoffgebiet']],
        negative_stoffgebiet_count = lambda df: [x.count("2") for x in df['stoffgebiet_bewertung']],
        negative_stoffgebiet_share = lambda df: df['negative_stoffgebiet_count']/df['stoffgebiet_count'],
    )
)

ballads_by_decade = meta_ballads.groupby('decade')
results = pd.DataFrame({
    'negative_entity_share' : ballads_by_decade['negative_entity_share'].mean(),
    'negative_stoffgebiet_share' : ballads_by_decade['negative_stoffgebiet_share'].mean(),
})
results
Out[49]:
negative_entity_share negative_stoffgebiet_share
decade
1850.0 0.172016 0.179054
1860.0 0.199735 0.273810
1870.0 0.149568 0.164940
1880.0 0.170139 0.247222
1890.0 0.222222 0.343137
1900.0 0.223214 0.253968
1910.0 0.131250 0.225000
In [50]:
# Smoothed rating shares of known individuals, relabelled for the legend.
rating_labels = {
    'bekanntes_individuum_neutral_share_smoothed' : 'neutral',
    'bekanntes_individuum_positiv_share_smoothed' : 'positiv',
    'bekanntes_individuum_negativ_share_smoothed' : 'negativ',
    'bekanntes_individuum_ambivalent_share_smoothed' : 'ambivalent',
}
meta_plot = ts[list(rating_labels)].rename(columns = rating_labels)
# save_ts_data(meta_plot)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Bekannte-Individuen-Bewertungen',
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = [
        'bekanntes_individuum_neutral_share',
        'bekanntes_individuum_positiv_share',
        'bekanntes_individuum_negativ_share',
        'bekanntes_individuum_ambivalent_share',
    ],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Bewertung bekannter Individuen im Zeitverlauf.pdf")
fig.show()
In [51]:
# Smoothed rating shares of unknown individuals, relabelled for the legend.
rating_labels = {
    'unbekanntes_individuum_neutral_share_smoothed' : 'neutral',
    'unbekanntes_individuum_positiv_share_smoothed' : 'positiv',
    'unbekanntes_individuum_negativ_share_smoothed' : 'negativ',
    'unbekanntes_individuum_ambivalent_share_smoothed' : 'ambivalent',
}
meta_plot = ts[list(rating_labels)].rename(columns = rating_labels)
# save_ts_data(meta_plot)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Unbekannte-Individuen-Bewertungen',
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = [
        'unbekanntes_individuum_neutral_share',
        'unbekanntes_individuum_positiv_share',
        'unbekanntes_individuum_negativ_share',
        'unbekanntes_individuum_ambivalent_share',
    ],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Bewertung unbekannter Individuen im Zeitverlauf.pdf")
fig.show()
In [52]:
# Smoothed rating shares of collectives, relabelled for the legend.
rating_labels = {
    'kollektiv_neutral_share_smoothed' : 'neutral',
    'kollektiv_positiv_share_smoothed' : 'positiv',
    'kollektiv_negativ_share_smoothed' : 'negativ',
    'kollektiv_ambivalent_share_smoothed' : 'ambivalent',
}
meta_plot = ts[list(rating_labels)].rename(columns = rating_labels)
# save_ts_data(meta_plot)

fig = create_ts_plot(
    data = meta_plot,
    columns = meta_plot.columns,
    y_axis_title = 'Anteil an Kollektiv-Bewertungen',
    add_corporas = sub_df,
    add_corpora_names = sub_names,
    add_corpora_categories = [
        'kollektiv_neutral_share',
        'kollektiv_positiv_share',
        'kollektiv_negativ_share',
        'kollektiv_ambivalent_share',
    ],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.5 Bewertung Kollektive im Zeitverlauf.pdf")
fig.show()
In [53]:
# Compare neutral and positive rating shares between 1850-1884 and 1885-1918
# for all entities and for each entity type, with a chi-squared test each.
results = pd.DataFrame()
# Each entry: count column, positive-count column, neutral-count column, row label.
entity_data_list = [
    ['total_entity_count', 'entity_positiv_count', 'entity_neutral_count', 'Alle Entitäten'],
    ['bekanntes_individuum_count', 'bekanntes_individuum_positiv_count', 'bekanntes_individuum_neutral_count', 'Bekanntes Individuum'],
    ['unbekanntes_individuum_count', 'unbekanntes_individuum_positiv_count', 'unbekanntes_individuum_neutral_count', 'Unbekanntes Individuum'],
    ['kollektiv_count', 'kollektiv_positiv_count', 'kollektiv_neutral_count', 'Kollektiv'],
]

ts_early = ts.loc[1850:1884]
ts_late = ts.loc[1885:1918]

for count_column, positiv_column, neutral_column, label in entity_data_list:
    early_count = ts_early[count_column].sum()
    early_neutral = ts_early[neutral_column].sum()
    early_positiv = ts_early[positiv_column].sum()
    late_count = ts_late[count_column].sum()
    late_neutral = ts_late[neutral_column].sum()
    late_positiv = ts_late[positiv_column].sum()

    # 2x2 tables: entities with the rating vs. all remaining entities,
    # first row early period, second row late period.
    chi2_neutral = chi2_contingency([
        [early_neutral, early_count-early_neutral],
        [late_neutral, late_count-late_neutral],
    ])
    chi2_positiv = chi2_contingency([
        [early_positiv, early_count-early_positiv],
        [late_positiv, late_count-late_positiv],
    ])

    # Insertion order of this dict defines the column order of `results`.
    row_values = {
        'early_count' : early_count,
        'early_neutral' : early_neutral,
        'early_neutral_share' : early_neutral/early_count,
        'early_positiv' : early_positiv,
        'early_positiv_share' : early_positiv/early_count,
        'late_count' : late_count,
        'late_neutral' : late_neutral,
        'late_neutral_share' : late_neutral/late_count,
        'late_positiv' : late_positiv,
        'late_positiv_share' : late_positiv/late_count,
        'neutral_chi' : chi2_neutral[0],
        'neutral_chi_p' : chi2_neutral[1],
        'positiv_chi' : chi2_positiv[0],
        'positiv_chi_p' : chi2_positiv[1],
    }
    for column, value in row_values.items():
        results.at[label, column] = value
In [54]:
# Counts plus the neutral-rating columns (every column without 'positiv' in its name).
results[[column for column in results.columns if 'positiv' not in column]].round(3)
Out[54]:
early_count early_neutral early_neutral_share late_count late_neutral late_neutral_share neutral_chi neutral_chi_p
Alle Entitäten 2932.0 793.0 0.270 984.0 364.0 0.370 34.532 0.000
Bekanntes Individuum 1576.0 342.0 0.217 449.0 117.0 0.261 3.541 0.060
Unbekanntes Individuum 386.0 174.0 0.451 245.0 129.0 0.527 3.149 0.076
Kollektiv 825.0 236.0 0.286 239.0 97.0 0.406 11.818 0.001
In [55]:
# Counts plus the positive-rating columns (every column without 'neutral' in its name).
results[[column for column in results.columns if 'neutral' not in column]].round(3)
Out[55]:
early_count early_positiv early_positiv_share late_count late_positiv late_positiv_share positiv_chi positiv_chi_p
Alle Entitäten 2932.0 1467.0 0.500 984.0 404.0 0.411 23.438 0.000
Bekanntes Individuum 1576.0 927.0 0.588 449.0 225.0 0.501 10.453 0.001
Unbekanntes Individuum 386.0 145.0 0.376 245.0 72.0 0.294 4.086 0.043
Kollektiv 825.0 311.0 0.377 239.0 85.0 0.356 0.275 0.600
In [56]:
# Bar chart of the 'kollektiv_count' column of `ts`.
axis_labels = {'kollektiv_count' : 'Anzahl Kollektiv-Entitäten', 'year' : ''}
fig = px.bar(ts, y = 'kollektiv_count', labels = axis_labels)
fig.show()
In [57]:
# List all collective entities (type '3') in anthology texts from 1897-1900.
authortitles = meta_anth.query("1897 <= year <= 1900")['author_title']
# Build the same "author – title" key on the ratings table so both can be matched.
meta_anth_ratings['author_title'] = meta_anth_ratings['author'] + ' – ' + meta_anth_ratings['title']
results = (
    meta_anth_ratings
    .query("author_title.isin(@authortitles) and type=='3'")
    .sort_values(by= ['rating', 'full'])
    [['author', 'title', 'full', 'type', 'rating']]
)
print(results.shape[0])
results
28
Out[57]:
author title full type rating
3471 Rüthning, Paul Der Überfall Attentäter 3 0
2568 Münchhausen, Börries von Bauernaufstand Bauern 3 0
3638 Miegel, Agnes Herzog Samo Boten 3 0
3167 Wildenbruch, Ernst von Inschrift an Villa Zirio in San Remo Deutsche 3 0
3560 Miegel, Agnes Die Staufen Deutsche 3 0
3852 Dahn, Felix Das neunzehnte Jahrhundert Deutsche 3 0
3133 Münchhausen, Börries von Der Abschied zu Fontainebleau Franzosen/Soldaten 3 0
3636 Miegel, Agnes Herzog Samo Kinder 3 0
2836 Eckstein, Ernst Nächtliche Stimmen Menschen/Sklaven/Gegenwart 3 0
3412 Frey, Adolf Die Kappelkämpfer Schweizer/Soldaten 3 0
2871 Münchhausen, Börries von Alte Landsknechte Soldaten/Landsknechte 3 0
3470 Rüthning, Paul Der Überfall Soldaten/Landsknechte 3 0
2536 Münchhausen, Börries von Wir. Zu Helm und Schild geboren Adlige 3 1
3534 Miegel, Agnes Henning Schindekopf. 2. Königsberg Bauern 3 1
2358 Diemar, Adamine von Zum Todestage der Kaiserin Augusta Deutsche 3 1
2362 Liliencron, A. von Es tönen die Glocken weit hin durch das Reich Deutsche 3 1
3329 Wilbrandt, Adolf von Im neuen Jahrhundert Deutsche 3 1
2535 Münchhausen, Börries von Wir. Zu Helm und Schild geboren Deutsche/Bauern 3 1
2545 Scholz, Wilhelm von Der Strauchritter Knechte 3 1
3384 Curti, Theod. Im Tale Schwyz Landbevölkerung 3 1
3328 Lingg, Hermann Dem neuen Jahrhundert Menschen 3 1
3614 George, Stefan Urlandschaft Menschen 3 1
2600 Wickenburg, Albrecht von Das letzte Aufgebot Soldaten 3 1
3559 Miegel, Agnes Die Staufen Staufer 3 1
3014 Puttkamer, Alberta von Nachts am Lügenfeld Söhne 3 1
2593 Wickenburg, Albrecht von Die Gelbschnäbel von Kolin Österreicher/Soldaten 3 1
3202 Wickenburg, Albrecht von Des Sandwirts Heimkehr Österreicher/Soldaten/Offiziere 3 1
2544 Scholz, Wilhelm von Der Strauchritter Räuber 3 2

Weiteres¶

In [58]:
# Relation between 'heroismus' and (a) the presence of each entity type and
# (b) a positive rating of each entity type.
meta_heroism = meta_anth_bin.copy()
for entity_type in ['bekanntes_individuum', 'unbekanntes_individuum', 'kollektiv']:
    # Presence flag: 1 if the text contains at least one entity of this type.
    meta_heroism[f'{entity_type}_vorhanden'] = [1 if x > 0 else 0 for x in meta_heroism[f'{entity_type}_count']]
for entity_type in ['bekanntes_individuum', 'unbekanntes_individuum', 'kollektiv']:
    # Missing values in the positive-rating column are set to 0.
    meta_heroism[f'{entity_type}_positiv'] = meta_heroism[f'{entity_type}_positiv'].replace({float('NaN'):0})

main_features = [
    'bekanntes_individuum_vorhanden',
    'unbekanntes_individuum_vorhanden',
    'kollektiv_vorhanden',
    'bekanntes_individuum_positiv',
    'unbekanntes_individuum_positiv',
    'kollektiv_positiv',
]

# Collect the per-feature tables and concatenate once; growing a DataFrame
# with pd.concat inside the loop (starting from an empty frame) copies the
# accumulated data on every iteration.
feature_results = []
for this_main_feature in main_features:
    this_results = relations_binbin(
        meta = meta_heroism,
        main_feature = this_main_feature,
        comp_features = ['heroismus']
    )
    this_results['main_feature'] = this_main_feature
    feature_results.append(this_results)
results = pd.concat(feature_results)
In [59]:
# Show 'main_feature' as the first column. It is already part of
# results.columns, so it must be excluded from the remaining columns;
# otherwise it is selected (and displayed) twice.
round(results[['main_feature'] + [x for x in results.columns if x != 'main_feature']], 3)
Out[59]:
main_feature wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected main_feature
heroismus bekanntes_individuum_vorhanden 0.207 100/483 0.309 423/1367 0.058 0.059 0.102 0.146 0.144 18.454 0.000 0.000 0.100 100.0 136.545 bekanntes_individuum_vorhanden
heroismus unbekanntes_individuum_vorhanden 0.308 436/1416 0.200 87/434 -0.152 -0.152 -0.107 -0.063 -0.064 18.913 0.000 0.000 0.101 87.0 122.693 unbekanntes_individuum_vorhanden
heroismus kollektiv_vorhanden 0.261 266/1021 0.310 257/829 0.011 0.008 0.049 0.091 0.088 5.525 0.019 0.020 0.055 257.0 234.361 kollektiv_vorhanden
heroismus bekanntes_individuum_positiv 0.165 150/907 0.396 373/943 0.192 0.191 0.230 0.270 0.272 120.781 0.000 0.000 0.256 150.0 256.411 bekanntes_individuum_positiv
heroismus unbekanntes_individuum_positiv 0.276 460/1668 0.346 63/182 -0.001 -0.002 0.070 0.143 0.140 4.008 0.045 0.056 0.047 63.0 51.452 unbekanntes_individuum_positiv
heroismus kollektiv_positiv 0.248 369/1489 0.427 154/361 0.118 0.123 0.179 0.234 0.235 45.795 0.000 0.000 0.157 154.0 102.056 kollektiv_positiv

Anthologien und bekannte Individuen¶

In [60]:
# Per anthology (edition used dated 2000 or earlier): total number of entities
# and number of "1" codes in the entity annotation, i.e. known individuals.
anthologies = meta.query("anthology_year_used_ed <= 2000")['anthology'].unique()
anthology_df = pd.DataFrame()
for anthology in anthologies:
    anthology_meta = meta.query("anthology == @anthology")
    # Only annotated texts contribute type codes.
    annotated_codes = anthology_meta.query("entity_simple.notna()")['entity_simple']
    anthology_df.at[anthology, 'entity_count'] = anthology_meta['entity_count'].sum()
    anthology_df.at[anthology, 'bekannte_individuen_count'] = ' '.join(annotated_codes).count("1")
anthology_df['bekannte_individuen_share'] = anthology_df['bekannte_individuen_count']/anthology_df['entity_count']

# Rank the anthologies with at least 30 entities by their share of known
# individuals (rank 1 = highest share).
anthology_df_short = anthology_df.query("entity_count >= 30").copy()
share_rank = anthology_df_short['bekannte_individuen_share'].rank(ascending=False)
anthology_df_short = anthology_df_short.assign(
    rank = share_rank,
    anthologies_higher = share_rank-1,
    anthologies_lower = anthology_df_short.shape[0]-share_rank,
)
In [61]:
# Box plot of the share of known individuals per anthology (only anthologies
# kept in anthology_df_short); every anthology is drawn as a point and its
# index label is added to the hover information.
px.box(
    anthology_df_short,
    y = 'bekannte_individuen_share',
    points='all',
    hover_data = [anthology_df_short.index],
    labels = {'bekannte_individuen_share' : 'Anteil bekannte Individuen'}
)
In [62]:
# Counts, share and rank for the anthology labelled '1915.Eggert-Windegg'.
anthology_df_short.loc['1915.Eggert-Windegg']
Out[62]:
entity_count                 460.000000
bekannte_individuen_count    208.000000
bekannte_individuen_share      0.452174
rank                          43.000000
anthologies_higher            42.000000
anthologies_lower             28.000000
Name: 1915.Eggert-Windegg, dtype: float64